In [5]:
# imports
import math
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.__version__
Out[5]:
In [6]:
spark
Out[6]:
In [7]:
sc
Out[7]:
In [11]:
# textFile = sc.textFile("data/README.md")
textFile = sc.textFile("/usr/local/Cellar/apache-spark/2.0.2/README.md")
In [12]:
textFile.take(10)
Out[12]:
In [13]:
linesWithSpark = textFile.filter(lambda line: "Spark" in line)
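A quick sanity check on the filtered RDD (a small addition, assuming the two cells above have run): count the matching lines and peek at a few of them.
In [ ]:
# how many lines mention "Spark", and what do the first few look like?
print(linesWithSpark.count())
linesWithSpark.take(5)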
In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)
# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.
# You can combine paramMaps, which are Python dictionaries.
paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)
# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())
# Prepare test data
test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform will only use the 'features' column.
# Note that model2.transform() outputs a "myProbability" column instead of the usual
# 'probability' column since we renamed the lr.probabilityCol parameter previously.
prediction = model2.transform(test)
selected = prediction.select("features", "label", "myProbability", "prediction")
for row in selected.collect():
    print(row)
from pyspark.mllib.linalg import Vectors as MLLibVectors, Matrices as MLLibMatrices
from pyspark.mllib.util import MLUtils
# Example inputs: DataFrames holding old-style mllib vector/matrix columns.
vecDF = spark.createDataFrame([(MLLibVectors.dense([1.0, 2.0, 3.0]),)], ["vec"])
matrixDF = spark.createDataFrame([(MLLibMatrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0]),)], ["mat"])
# Convert DataFrame columns from mllib types to the new ml types.
convertedVecDF = MLUtils.convertVectorColumnsToML(vecDF)
convertedMatrixDF = MLUtils.convertMatrixColumnsToML(matrixDF)
# Convert a single vector or matrix.
mlVec = MLLibVectors.dense([1.0, 2.0, 3.0]).asML()
mlMat = MLLibMatrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0]).asML()
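One possible follow-up, not in the original cell: score model2's predictions with an evaluator. This sketch assumes the prediction DataFrame produced above is still in scope and relies on the default rawPrediction output column.
In [ ]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# areaUnderROC over the three test rows; purely illustrative at this data size
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
print(evaluator.evaluate(prediction))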
In [15]:
sqlCtx
Out[15]:
In [16]:
rdd = sc.parallelize(range(1000), 20)
rdd.getNumPartitions()
Out[16]:
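A small aside, added here as a sketch: the partition count can be changed after the fact. coalesce reduces the number of partitions without a shuffle, while repartition triggers one.
In [ ]:
# shrink to 5 partitions (no shuffle), or reshuffle into 40
print(rdd.coalesce(5).getNumPartitions())
print(rdd.repartition(40).getNumPartitions())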
In [17]:
rdd.map(lambda r: ((r // 100) * 100, 1)).reduceByKey(lambda x, y: x + y).collect()
Out[17]:
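The bucket counts above come back in arbitrary order; a minor variation (added as a sketch) sorts by bucket key before collecting.
In [ ]:
# same 100-wide bucketing, ordered by bucket
rdd.map(lambda r: ((r // 100) * 100, 1)).reduceByKey(lambda x, y: x + y).sortByKey().collect()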
In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
# Fit the pipeline to training documents.
model = pipeline.fit(training)
# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")], ["id", "text"])
# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "prediction")
for row in selected.collect():
    print(row)
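A natural extension, added here as a sketch rather than part of the original notebook: tune the pipeline with CrossValidator over a small parameter grid. With only four training documents the numbers are meaningless, but the pattern carries over to real data.
In [ ]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# grid over the hashing dimensionality and the regularization strength
paramGrid = (ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=2)
cvModel = cv.fit(training)
for row in cvModel.transform(test).select("id", "text", "prediction").collect():
    print(row)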
In [19]:
from ipywidgets import widgets
from IPython.display import display
text = widgets.Text()
display(text)
def handle_submit(sender):
    print(text.value)
text.on_submit(handle_submit)
In [20]:
from ipywidgets import interact
from numpy import arange
t = arange(0.0, 1.0, 0.01)
def pltsin(f):
    # use numpy's vectorized sin so it works on the whole array t
    plt.plot(t, np.sin(2 * np.pi * t * f))
    plt.show()
interact(pltsin, f=(1,10,0.1))
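Equivalently (a small addition), the slider can be built explicitly instead of using the (min, max, step) tuple shorthand:
In [ ]:
from ipywidgets import FloatSlider
interact(pltsin, f=FloatSlider(min=1.0, max=10.0, step=0.1, value=1.0))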
In [21]:
arange(0.0, 1.0, 0.01)
Out[21]:
In [22]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import *
from IPython.display import display
import ipywidgets as widgets
plt.style.use('ggplot')
NUMBER_OF_PINGS = 8
# displaying the text widget
text = widgets.Text(description="Domain to ping", width=200)
display(text)
# preparing the plot
data = pd.DataFrame()
x = range(1,NUMBER_OF_PINGS+1)
plots = dict()
fig, ax = plt.subplots()
plt.xlabel('iterations')
plt.ylabel('ms')
plt.xticks(x)
plt.show()
# preparing a container to put in created checkbox per domain
checkboxes = []
cb_container = widgets.HBox()
display(cb_container)
# add button that updates the graph based on the checkboxes
button = widgets.Button(description="Update the graph")
# function to deal with the added domain name
def handle_submit(sender):
    # IPython shell magic: ping the submitted domain and capture the output
    res = !ping -c {NUMBER_OF_PINGS} {text.value}
    hits = res.grep('64 bytes').fields(-2).s.replace("time=", "").split()
    if len(hits) == 0:
        print("Domain gave error on pinging")
    else:
        # rebuild the plot based on the ping results
        data[text.value] = hits
        data[text.value] = data[text.value].astype(float)
        plots[text.value], = ax.plot(x, data[text.value], label=text.value)
        plt.legend()
        plt.draw()
    # add a new checkbox for the new domain
    checkboxes.append(widgets.Checkbox(description=text.value, value=True, width=90))
    cb_container.children = [i for i in checkboxes]
    if len(checkboxes) == 1:
        display(button)
# function to deal with the checkbox update button
def on_button_clicked(b):
    for c in cb_container.children:
        if not c.value:
            plots[c.description].set_visible(False)
        else:
            plots[c.description].set_visible(True)
    plt.legend()
    plt.draw()
button.on_click(on_button_clicked)
text.on_submit(handle_submit)
plt.show()
In [ ]: